######################################################################################
# This Julia script reads the Google Ngrams data and organizes it into separate
# files for each language and year.
#
# It was run with Julia 0.6.
# The organize part in particular can take a long time so multiple parallel workers 
# are helpful.
#######################################################################################

# install packages if needed
# Pkg.add in Julia 0.6 takes a single package name per call (it has no method for a
# vector of names), so the packages are added one at a time.
for pkg in ["Logging", "CSV", "Libz", "DataFrames", "BufferedStreams", "PyCall"]
  Pkg.add(pkg)
end

include("consts.jl")

import GoogleNgrams

# fetch the 5-gram data of every language listed in `langs`, one language at a time
foreach(langs) do language
  GoogleNgrams.gbndownload(5, language)
end

# download total_counts
# Create the very directory the downloads are written into, so that `download` cannot
# fail on a missing destination folder.
# NOTE(review): the original created joinpath(gndir,"totals") instead; this assumes
# `totalsdir` (presumably defined in consts.jl) names that same directory -- confirm.
mkpath(totalsdir)
for lang in langs
  # the remote file name is reused unchanged as the local file name
  local totalsfile = "googlebooks-$lang-all-totalcounts-20120701.txt"
  download("http://storage.googleapis.com/books/ngrams/books/$totalsfile", joinpath(totalsdir, totalsfile))
end

# run the totalcounts clean-up once per language, in the order given by `langs`
for language in langs
  GoogleNgrams.cleantotal(language)
end

# An example of how to organize a single language and to restart a run if the process gets killed along the way:
# @time GoogleNgrams.organize(langs[2],5; startfile="googlebooks-eng-gb-all-5gram-20120701-th.gz", makeuprun=true)

include("addworkers.jl")
@everywhere using GoogleNgrams

# organize every language, spreading the languages over the available workers;
# the (+) reduction over the per-iteration 1s makes this call wait for all of them
@parallel (+) for lang in langs
  info("analyzing $lang ...")
  # "eng-gb" and "rus" get the run with clean=true; all other languages are
  # organized with clean=false and partsonly=true
  needsclean = lang == "eng-gb" || lang == "rus"
  if !needsclean
    GoogleNgrams.organize(lang, 5; clean=false, partsonly=true)
  else
    GoogleNgrams.organize(lang, 5; clean=true)
  end
  1  # each iteration contributes 1 to the reduction
end
